suppressPackageStartupMessages({
library(DESeq2)
library(dplyr)
library(gplots)
library(ggplot2)
library(here)
library(hyperSpec)
library(limma)
library(parallel)
library(plotly)
library(tibble)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
load(here("analysis/salmon/ZE-29Seed-dds.rda"))
levels(dds.29z$Experiment) <- c("ZE","Seed")
vsd <- varianceStabilizingTransformation(dds.29z,blind=FALSE)
mat <- assay(vsd)
We select FMG and ZE from mature seeds (29Seed), that correspond to Stage B8 and B9 of the ZE
sel <- dds.29z$Experiment == "Seed" | dds.29z$Time %in% c("B8","B9")
mat <- mat[,sel]
batch = dds.29z$Experiment[sel]
contrasts(batch) <- contr.sum(levels(batch))
design <- model.matrix(~batch)
fit <- lmFit(mat, design)
beta is the actual batch correction for each gene
beta <- fit$coefficients[, -1, drop = FALSE]
beta[is.na(beta)] <- 0
cross-validation with the removeBatchEffect function
stopifnot(all((mat - beta %*% t(design[,-1,drop=FALSE]))[1:6,1:6] ==
removeBatchEffect(mat,dds.29z$Experiment[sel])[1:6,1:6]))
load(here("analysis/salmon/ZE-SE-dds.rda"))
levels(dds.sz$Experiment) <- c("ZE","SE")
vsd <-varianceStabilizingTransformation(dds.sz,blind=FALSE)
fullbatch <- vsd$Experiment
contrasts(fullbatch) <- contr.sum(levels(fullbatch))
fullbatch <- model.matrix(~fullbatch)[, -1, drop = FALSE]
assay(vsd) <- assay(vsd) - beta %*% t(fullbatch)
pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
An the number of possible combinations
nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(vsd)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise - the filtering is meant to restrict the number of genes to be visualised to a reasonable amount that can be viewed in a limited amount of time
conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
vst.cutoff <- 6
hm <- heatmap.2(t(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds,cex=0.8)
dir.create(here("analysis/batchCorrection"),showWarnings=FALSE)
save(vsd,file=here("analysis/batchCorrection/vsd.rda"))
look https://www.bioconductor.org/packages//2.11/bioc/vignettes/limma/inst/doc/usersguide.pdf e.g. chapter 8.5.4 select the data to drop Stage B0, B9 and B10 (insufficient / lack of replication)
vsd <- vsd[,!vsd$Time %in% c("B0","B9","B10")]
vsd$Time <- droplevels(vsd$Time)
Stage<-vsd$Time
Tissue<-vsd$Tissue
design <- model.matrix(~Stage*Tissue)
fit <- lmFit(assay(vsd), design)
fit <- eBayes(fit)
## Warning: Zero sample variances detected, have been offset away from zero
colnames(design)
## [1] "(Intercept)" "StageB2" "StageB3" "StageB4"
## [5] "StageB5" "StageB6" "StageB7" "StageB8"
## [9] "TissueSE" "TissueZE" "StageB2:TissueSE" "StageB3:TissueSE"
## [13] "StageB4:TissueSE" "StageB5:TissueSE" "StageB6:TissueSE" "StageB7:TissueSE"
## [17] "StageB8:TissueSE" "StageB2:TissueZE" "StageB3:TissueZE" "StageB4:TissueZE"
## [21] "StageB5:TissueZE" "StageB6:TissueZE" "StageB7:TissueZE" "StageB8:TissueZE"
topTable(fit, coef=10-9)
## logFC AveExpr t P.Value adj.P.Val
## MA_9485570g0010.1 4.502863 2.882118 2001.746 5.729726e-141 3.785573e-136
## MA_9033029g0010.1 4.034120 2.737445 1793.367 3.188583e-138 1.053332e-133
## MA_6316973g0010.1 3.928945 2.704983 1746.611 1.456763e-137 3.208230e-133
## MA_10171234g0010.1 3.740690 2.646880 1662.922 2.453076e-136 4.051807e-132
## MA_193120g0010.1 3.486334 2.568375 1549.848 1.407602e-134 1.859977e-130
## MA_10429987g0010.1 2.645023 2.308711 1175.844 1.111578e-127 1.224014e-123
## MA_118237g0010.1 2.559778 2.282401 1137.948 7.313236e-127 6.902546e-123
## MA_9902954g0010.1 2.490337 2.260968 1107.078 3.556181e-126 2.936917e-122
## MA_74695g0010.1 2.453448 2.249583 1090.679 8.388890e-126 6.158284e-122
## MA_150666g0020.1 2.441221 2.245809 1085.244 1.118093e-125 7.387127e-122
## B
## MA_9485570g0010.1 279.9882
## MA_9033029g0010.1 277.9945
## MA_6316973g0010.1 277.4694
## MA_10171234g0010.1 276.4441
## MA_193120g0010.1 274.8589
## MA_10429987g0010.1 267.2728
## MA_118237g0010.1 266.2256
## MA_9902954g0010.1 265.3226
## MA_74695g0010.1 264.8234
## MA_150666g0020.1 264.6549
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] tibble_3.0.4 plotly_4.9.2.1
## [3] limma_3.46.0 hyperSpec_0.99-20201127
## [5] xml2_1.3.2 lattice_0.20-41
## [7] here_1.0.0 ggplot2_3.3.2
## [9] gplots_3.1.1 dplyr_1.0.2
## [11] DESeq2_1.30.0 SummarizedExperiment_1.20.0
## [13] Biobase_2.50.0 MatrixGenerics_1.2.0
## [15] matrixStats_0.57.0 GenomicRanges_1.42.0
## [17] GenomeInfoDb_1.26.1 IRanges_2.24.0
## [19] S4Vectors_0.28.0 BiocGenerics_0.36.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5 RColorBrewer_1.1-2
## [4] httr_1.4.2 rprojroot_2.0.2 tools_4.0.3
## [7] R6_2.5.0 KernSmooth_2.23-18 DBI_1.1.0
## [10] lazyeval_0.2.2 colorspace_2.0-0 withr_2.3.0
## [13] tidyselect_1.1.0 bit_4.0.4 compiler_4.0.3
## [16] Cairo_1.5-12.2 DelayedArray_0.16.0 labeling_0.4.2
## [19] caTools_1.18.0 scales_1.1.1 genefilter_1.72.0
## [22] stringr_1.4.0 digest_0.6.27 rmarkdown_2.5
## [25] XVector_0.30.0 jpeg_0.1-8.1 pkgconfig_2.0.3
## [28] htmltools_0.5.0 highr_0.8 htmlwidgets_1.5.2
## [31] rlang_0.4.9 RSQLite_2.2.1 generics_0.1.0
## [34] farver_2.0.3 jsonlite_1.7.1 crosstalk_1.1.0.1
## [37] BiocParallel_1.24.1 gtools_3.8.2 RCurl_1.98-1.2
## [40] magrittr_2.0.1 GenomeInfoDbData_1.2.4 Matrix_1.2-18
## [43] Rcpp_1.0.5 munsell_0.5.0 lifecycle_0.2.0
## [46] stringi_1.5.3 yaml_2.2.1 zlibbioc_1.36.0
## [49] blob_1.2.1 crayon_1.3.4 splines_4.0.3
## [52] annotate_1.68.0 locfit_1.5-9.4 knitr_1.30
## [55] pillar_1.4.7 geneplotter_1.68.0 XML_3.99-0.5
## [58] glue_1.4.2 evaluate_0.14 latticeExtra_0.6-29
## [61] data.table_1.13.4 vctrs_0.3.5 png_0.1-7
## [64] testthat_3.0.0 gtable_0.3.0 purrr_0.3.4
## [67] tidyr_1.1.2 xfun_0.19 xtable_1.8-4
## [70] survival_3.2-7 viridisLite_0.3.0 AnnotationDbi_1.52.0
## [73] memoise_1.1.0 ellipsis_0.3.1